{
 "cells": [
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# datasets 基本使用",
   "id": "7f4061b35fdc2851"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-12T03:18:15.580057Z",
     "start_time": "2024-12-12T03:18:12.453520Z"
    }
   },
   "cell_type": "code",
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# Load the local CSV file as the train split; the result is a single Dataset.\n",
    "dataset = load_dataset(\"csv\", data_files=\"./_test/data.csv\", split=\"train\")\n",
    "dataset"
   ],
   "id": "82014ad89782db7",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['label', 'review'],\n",
       "    num_rows: 7766\n",
       "})"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 1
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## 按照数据集划分进行加载",
   "id": "8bf8581cf48f01ad"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-11T09:45:41.535197Z",
     "start_time": "2024-12-11T09:45:40.760323Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Load only rows 10-99 of the train split (the slice end is exclusive,\n",
    "# which gives 90 rows). Note: this rebinds `dataset`.\n",
    "dataset = load_dataset(\n",
    "    \"csv\",\n",
    "    data_files=\"./_test/data.csv\",\n",
    "    split=\"train[10:100]\",\n",
    ")\n",
    "dataset"
   ],
   "id": "b93e90aaeff6c5e7",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['label', 'review'],\n",
       "    num_rows: 90\n",
       "})"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 3
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-11T09:47:04.063056Z",
     "start_time": "2024-12-11T09:47:03.275821Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Load the first 50% of the train split, starting at row 0.\n",
    "# Note: this rebinds `dataset`, so on a top-to-bottom run every later cell\n",
    "# operates on this half-size subset.\n",
    "dataset = load_dataset(\n",
    "    \"csv\",\n",
    "    data_files=\"./_test/data.csv\",\n",
    "    split=\"train[:50%]\",\n",
    ")\n",
    "dataset"
   ],
   "id": "8dbf998397af9a29",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['label', 'review'],\n",
       "    num_rows: 3883\n",
       "})"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 4
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## 查看数据集",
   "id": "c9146f7a766964"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-11T09:49:20.431480Z",
     "start_time": "2024-12-11T09:49:20.408036Z"
    }
   },
   "cell_type": "code",
   "source": "dataset[\"review\"][0]",
   "id": "8dfe311f815f2fb0",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'距离川沙公路较近,但是公交指示不对,如果是\"蔡陆线\"的话,会非常麻烦.建议用别的路线.房间较为简单.'"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 11
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "查看前两条评论",
   "id": "a7ab948128fafdcb"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-11T09:50:47.478250Z",
     "start_time": "2024-12-11T09:50:47.463554Z"
    }
   },
   "cell_type": "code",
   "source": "dataset[\"review\"][:2]",
   "id": "752990ab6ba34497",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['距离川沙公路较近,但是公交指示不对,如果是\"蔡陆线\"的话,会非常麻烦.建议用别的路线.房间较为简单.',\n",
       " '商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!']"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 18
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-11T09:53:53.369476Z",
     "start_time": "2024-12-11T09:53:53.362778Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Names of the dataset's columns (the header names)\n",
    "dataset.column_names"
   ],
   "id": "8cf38de943257f3a",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['label', 'review']"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 20
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-11T09:54:36.454316Z",
     "start_time": "2024-12-11T09:54:36.445057Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Feature schema of the dataset: the type of each column\n",
    "dataset.features"
   ],
   "id": "ad571942f4a77755",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'label': Value(dtype='int64', id=None),\n",
       " 'review': Value(dtype='string', id=None)}"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 21
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## 数据集划分",
   "id": "dd3dae417f4647fe"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-12T03:18:23.515781Z",
     "start_time": "2024-12-12T03:18:23.485617Z"
    }
   },
   "cell_type": "code",
   "source": "dataset.train_test_split(test_size=0.1)",
   "id": "fe6b810b91b7f0ad",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['label', 'review'],\n",
       "        num_rows: 6989\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['label', 'review'],\n",
       "        num_rows: 777\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 3
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## 数据集选取与过滤",
   "id": "b6d8b32fd159dbf6"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-11T09:58:10.410630Z",
     "start_time": "2024-12-11T09:58:10.399213Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Select rows by index (here rows 0 and 1)\n",
    "dataset.select([0, 1])"
   ],
   "id": "42f32be6e2b0fd46",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['label', 'review'],\n",
       "    num_rows: 2\n",
       "})"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 26
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-11T10:09:18.486431Z",
     "start_time": "2024-12-11T10:09:18.424254Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Keep rows with a positive label whose review is longer than 100 characters.\n",
    "# The None check guards against missing review values (e.g. empty CSV cells),\n",
    "# which would otherwise make len() raise a TypeError.\n",
    "filter_dataset = dataset.filter(\n",
    "    lambda item: item[\"label\"] > 0\n",
    "    and item[\"review\"] is not None\n",
    "    and len(item[\"review\"]) > 100\n",
    ")\n",
    "filter_dataset"
   ],
   "id": "840b7d37aa201fbf",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Filter:   0%|          | 0/3883 [00:00<?, ? examples/s]"
      ],
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "ad6721dcba5c4511a1e91d39a5f98eea"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['label', 'review'],\n",
       "    num_rows: 1375\n",
       "})"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 38
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
